home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Collection of Tools & Utilities
/
Collection of Tools and Utilities.iso
/
edit
/
tde40.zip
/
regx.c
< prev
next >
Wrap
C/C++ Source or Header
|
1994-06-05
|
38KB
|
1,302 lines
/*
* These functions compile a regular expression into an NFA and recognize
* a pattern.
*
* Regular expressions are often used to describe a pattern that might
* match several different or similar words. An example of a regular
* expression subset is the wild card characters '*' and '?' used to list
* files in a directory.
*
* Might as well read the books and papers written by those who first wrote
* the various [a-z]?greps. Ken Thompson was the author of grep. In one
* weekend, Al Aho wrote egrep and fgrep. Incidentally, when Dr. Aho was
* the guest speaker at the Distinguished Lecture Series at Georgia Tech,
* he personally autographed my copy of his book _Compilers: Principles,
* Techniques, and Tools_, aka the "Dragon Book."
*
* See:
*
* Ken Thompson, "Regular Expression Search Algorithm." _Communications
* of the ACM_, 11 (No. 6): 419-422, 1968.
*
* Alfred V. Aho, Ravi Sethi, and Jeffrey D. Ullman, _Compilers: Principles,
* Techniques, and Tools_, Addison-Wesley, Reading, Mass., 1986,
* Chapter 3, "Lexical Analysis", pp 83-158. ISBN 0-201-10088-6.
*
* Robert Sedgewick, _Algorithms in C_, Addison-Wesley, Reading, Mass.,
* 1990, Chapter 20, "Pattern Matching", pp. 293-303, and Chapter 21,
* "Parsing", pp. 305-317. ISBN 0-201-51425-7.
*
* Andrew Hume, "A Tale of Two Greps." _Software--Practice and Experience_,
* 18 (No. 11): 1063-1072, 1988.
*
*
* Regular Expressions in TDE
*
* The core of the regular expression search algorithm in TDE is based on
* the nondeterministic finite-state machine in Dr. Segdewick's book.
* Additional parsing, operator precedence, and nfa construction and
* simulation info is from Dr. Aho's book. The nfa node naming convention
* and additional nfa construction guidelines are from Ken Thompson's paper.
* I'm much too stupid to compile a regular expression into assembly language
* as suggested in Ken Thompson's paper, but I did get the idea to do the
* next best thing and added C library functions as NNODE terminal types.
*
* The local global NFA is builded with two arrays and a deque. The
* first array, next1, in the NFA is used to hold the path for lambda
* transitions. The second array, next2, is used to hold alternate paths.
* Next1 can hold all types of nodes, but it gets priority for lambda
* transitions. The deque is a circular array where future states are queued
* in the head and present states are pushed in the tail.
*
* Searching for regular expressions in TDE is not very fast. The worst
* case search, .*x, where x is any word or letter, is probably the slowest
* of any implementation with a regular expression search function. However,
* TDE can handle a large regular expression subset.
*
* In version 3.1, I added BOW and EOW (beginning of word and end of word.)
*
* In version 3.2, the macro letters were moved to letters.h per Byrial Jensen.
* We also use the bj_isxxx( ) functions to test whether a letter falls into
* a predefined macro class.
*
* New editor name: TDE, the Thomson-Davis Editor.
* Author: Frank Davis
* Date: June 5, 1991, version 1.0
* Date: July 29, 1991, version 1.1
* Date: October 5, 1991, version 1.2
* Date: January 20, 1992, version 1.3
* Date: February 17, 1992, version 1.4
* Date: April 1, 1992, version 1.5
* Date: June 5, 1992, version 2.0
* Date: October 31, 1992, version 2.1
* Date: April 1, 1993, version 2.2
* Date: June 5, 1993, version 3.0
* Date: August 29, 1993, version 3.1
* Date: November 13, 1993, version 3.2
* Date: June 5, 1994, version 4.0
*
* This code is released into the public domain, Frank Davis.
* You may use and distribute it freely.
*/
#include "tdestr.h"
#include "common.h"
#include "tdefunc.h"
#include "define.h"
#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif
/*
* types of nodes in the NFA.
*
* let's use the node naming convention in Ken Thompson's reg ex paper.
*/
#define CNODE 0
#define NNODE 1
#define SCAN -1
/*
* types of terminals in NNODEs.
*
* starting with simple ASCII terminals, it's easy to build in other types of
* terminals, e.g. wild chars, BOL, EOL, and character classes. actually,
* we can easily build ANSI C ctype library function NNODEs.
*/
#define STRAIGHT_ASCII 0
#define IGNORE_ASCII 1
#define WILD 2
#define BOL 3
#define EOL 4
#define CLASS 5
#define NOTCLASS 6
#define WHITESPACE 7
#define ALPHA 8
#define UPPER 9
#define LOWER 10
#define ALPHANUM 11
#define DECIMAL 12
#define HEX 13
#define BOW 14
#define EOW 15
/*
* types of terminals in CNODEs.
*/
#define START 0
#define ACCEPT 1
#define OR_NODE 2
#define JUXTA 3
#define CLOSURE 4
#define ZERO_OR_ONE 5
int lookahead;
int regx_rc;
int reg_len;
int parser_state;
char class_bits[32];
int bit[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
int c1;
int c2;
int ttype;
int regx_error_line;
NFA_TYPE *nfa_pointer;
int nfa_status;
int search_len;
int search_col;
text_ptr search_string;
int queued_states[REGX_SIZE+2];
int deque[REGX_SIZE*2];
int dq_head;
int dq_tail;
int stacked_node_count;
int reset_count;
/*
* Name: find_regx
* Purpose: To set up and find a regular expression.
* Date: June 5, 1993
* Passed: window: pointer to current window
* Notes: Keep the regular expression until changed.
*/
int find_regx( TDE_WIN *window )
{
int direction;
int new_string;
long found_line;
long bin_offset;
line_list_ptr ll;
register TDE_WIN *win; /* put window pointer in a register */
int rc;
int old_rcol;
int rcol;
char answer[MAX_COLS+2];
char temp[MAX_COLS+2];
switch (g_status.command) {
case FindRegX :
new_string = TRUE;
direction = FORWARD;
break;
case RepeatFindRegX :
new_string = regx.search_defined != OK ? TRUE : FALSE;
direction = FORWARD;
break;
case RepeatFindRegXBackward :
new_string = regx.search_defined != OK ? TRUE : FALSE;
direction = BACKWARD;
break;
default :
new_string = 0;
direction = 0;
assert( FALSE );
break;
}
win = window;
entab_linebuff( );
if (un_copy_line( win->ll, win, TRUE ) == ERROR)
return( ERROR );
regx_error_line = win->bottom_line;
/*
* get search text, using previous as default
*/
rc = OK;
if (new_string == TRUE) {
*answer = '\0';
if (regx.search_defined == OK) {
assert( strlen( (char *)regx.pattern ) < (size_t)g_display.ncols );
strcpy( answer, (char *)regx.pattern );
}
/*
* string to find:
*/
if (get_name( reg1, win->bottom_line, answer,
g_display.message_color ) != OK || *answer == '\0')
return( ERROR );
regx.search_defined = OK;
assert( strlen( answer ) < (size_t)g_display.ncols );
strcpy( (char *)regx.pattern, answer );
rc = build_nfa( );
if (rc == ERROR)
regx.search_defined = ERROR;
}
if (regx.search_defined == OK) {
old_rcol = win->rcol;
if (mode.inflate_tabs)
win->rcol = entab_adjust_rcol( win->ll->line, win->ll->len, win->rcol);
update_line( win );
show_search_message( SEARCHING, g_display.diag_color );
bin_offset = win->bin_offset;
if (direction == FORWARD) {
if ((ll = forward_regx_search( win, &found_line, &rcol )) != NULL) {
if (g_status.wrapped && g_status.macro_executing)